library(dtwclust)
library(dtw)
df<-read.csv("../clean_data/gap_list_year.csv")
df
df <- subset(df, select = -c(X) )
df
#colnames(df)[colnames(df) == "X2000"] = "2000"
colnames(df)[colnames(df) == "X2003"] = "2003"
colnames(df)[colnames(df) == "X2005"] = "2005"
colnames(df)[colnames(df) == "X2007"] = "2007"
colnames(df)[colnames(df) == "X2009"] = "2009"
colnames(df)[colnames(df) == "X2011"] = "2011"
colnames(df)[colnames(df) == "X2013"] = "2013"
colnames(df)[colnames(df) == "X2015"] = "2015"
colnames(df)[colnames(df) == "X2017"] = "2017"
colnames(df)[colnames(df) == "X2019"] = "2019"
colnames(df)[colnames(df) == "X2022"] = "2022"
df
jurisdiction = df[['Jurisdiction']]
dtw_df <- df[, -1]
dtw_df
df_lst <- tslist(dtw_df)
remove_nan <- function(ts) {
ts[!is.na(ts)]
}
# Apply the function to each time series in the list
df_lst <- lapply(df_lst, remove_nan)
head(df_lst)
$`1`
[1] 2 -3 2 -1 0 0 -1 3 -3 1
$`2`
[1] 2 2 -1 0 -1 1 -1 0 5 3
$`3`
[1] 0 0 3 3 6 1 0 5 -2 2
$`4`
[1] -2 -3 0 -2 2 -1 0 1 -3 2
$`5`
[1] 2 1 0 4 0 2 -1 0 -2 1
$`6`
[1] 1 0 1 3 -1 -1 3 0 1 2
df_cvi <- list()
for (i in 2:10){
df_clust <- tsclust(df_lst, type = "partitional", k = i, distance = "dtw_basic", centroid = "pam")
df_metric <- cvi(df_clust, type = "valid", log.base = 10)
df_cvi <- append(df_cvi, list(df_metric))
}
df_cvi_ma <- do.call(rbind, df_cvi)
rw <- c("K2","K3","K4","K5","K6","K7","K8","K9","K10")
rownames(df_cvi_ma) <- rw
print(df_cvi_ma)
Sil SF CH DB DBstar D COP
K2 0.173608338 5.754619e-11 18.629393 1.850840 1.850840 0.17021277 0.5083717
K3 0.061994057 1.776357e-15 11.801980 1.794359 1.996933 0.13636364 0.4786862
K4 0.017353712 0.000000e+00 15.820765 1.794676 1.821779 0.08571429 0.4622149
K5 0.039732944 0.000000e+00 9.707224 2.040603 2.324964 0.12820513 0.3938292
K6 0.033677146 0.000000e+00 5.932584 1.978748 2.067152 0.14705882 0.3775082
K7 0.059988611 0.000000e+00 6.868439 1.921871 2.228551 0.16129032 0.3309081
K8 0.004863317 0.000000e+00 4.975106 1.671385 1.911485 0.12820513 0.3679673
K9 -0.013875696 0.000000e+00 5.088627 1.481611 1.663201 0.22580645 0.3468591
K10 0.005822864 0.000000e+00 4.517112 1.577752 1.877663 0.19354839 0.3227653
– “Sil” (!): Silhouette index (Rousseeuw (1987); to be maximized).-K4 – “SF” (~): Score Function (Saitta et al. (2007); to be maximized; see notes). – “CH” (~): Calinski-Harabasz index (Arbelaitz et al. (2013); to be maximized).-k3 – “DB” (?): Davies-Bouldin index (Arbelaitz et al. (2013); to be minimized).k4 – “DBstar” (?): Modified Davies-Bouldin index (DB*) (Kim and Ramakrishna (2005); to be minimized). -k4 – “D” (!): Dunn index (Arbelaitz et al. (2013); to be maximized). k5 – “COP” (!): COP index (Arbelaitz et al. (2013); to be minimized). k9
# different seeds
df_cvi2 <- list()
for (i in 1:100){
df_clust2 <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw_basic", centroid = "pam", seed=i)
df_metric2 <- cvi(df_clust2, type = "valid", log.base = 10)
df_cvi2 <- append(df_cvi2, list(df_metric2))
}
df_cvi_ma2 <- do.call(rbind, df_cvi2)
rw2 <- as.character(seq(1, 100))
rownames(df_cvi_ma2) <- rw2
print(df_cvi_ma2)
Sil SF CH DB DBstar D COP
1 0.1112282281 0.000000e+00 11.063717 1.350185 1.350185 0.17142857 0.4077778
2 0.0041846999 0.000000e+00 10.146805 3.636564 4.424066 0.06818182 0.4452244
3 0.0788971345 0.000000e+00 9.184859 1.911920 1.983610 0.14285714 0.4040440
4 0.0215495924 1.554312e-15 8.909565 1.554929 1.688633 0.15909091 0.4428019
5 0.0739284731 0.000000e+00 9.860140 1.506246 1.506246 0.20000000 0.4183652
6 0.1451902673 0.000000e+00 10.379167 1.569420 1.673541 0.29032258 0.4093995
7 0.1047522976 0.000000e+00 11.165746 1.902178 2.032550 0.17948718 0.4064280
8 0.0520865298 0.000000e+00 14.409142 1.586752 1.697970 0.13333333 0.4431176
9 0.0777696442 1.221245e-14 10.416368 1.465499 1.641786 0.15909091 0.4309232
10 0.0127504151 0.000000e+00 11.693419 2.168782 2.316793 0.06521739 0.4610232
11 -0.0255270726 0.000000e+00 11.694488 1.766323 1.852051 0.04545455 0.4998560
12 0.0240465824 1.776357e-15 9.432247 1.824725 2.091366 0.13636364 0.4550701
13 0.0645635119 0.000000e+00 12.437010 1.849255 2.121703 0.11363636 0.4281500
14 0.0626394552 0.000000e+00 7.968391 2.158141 2.337333 0.14705882 0.4311345
15 0.1332875620 0.000000e+00 7.470555 1.564762 1.665884 0.25806452 0.4308629
16 0.0385685305 0.000000e+00 9.857992 2.283181 2.683603 0.11363636 0.4254024
17 0.0153777511 0.000000e+00 7.673469 2.457369 2.571175 0.13513514 0.4408870
18 0.0391338403 3.401368e-11 7.427935 1.380679 1.491625 0.12765957 0.4878772
19 0.0884496899 4.884981e-15 9.255042 1.563378 1.716191 0.11363636 0.4378289
20 0.0667292905 0.000000e+00 9.089875 1.792573 1.874018 0.11904762 0.4431073
21 0.0270445846 0.000000e+00 13.856897 1.899701 2.156261 0.13636364 0.4380518
22 0.0315465882 0.000000e+00 12.182088 2.114465 2.529093 0.11363636 0.4262321
23 0.0086302916 0.000000e+00 9.405468 2.161295 2.484505 0.11363636 0.4375804
24 0.0685063306 0.000000e+00 10.183333 1.869046 2.022797 0.15909091 0.4312686
25 0.0313940154 0.000000e+00 9.044674 1.888079 1.942901 0.21212121 0.4312869
26 0.0705809471 0.000000e+00 7.518906 2.154398 2.238363 0.15151515 0.4281232
27 0.0462248499 0.000000e+00 6.908565 2.403041 2.795157 0.14705882 0.4236158
28 0.0224723209 0.000000e+00 10.769142 2.117756 2.543446 0.11363636 0.4542806
29 0.0500295013 0.000000e+00 10.714881 1.559701 1.713680 0.15909091 0.4313073
30 0.0218337296 0.000000e+00 13.147028 1.641133 1.828079 0.13333333 0.4929890
31 0.0616413974 0.000000e+00 9.571009 2.059298 2.233368 0.13636364 0.4433064
32 0.0940169839 0.000000e+00 16.391736 1.737271 1.801854 0.15789474 0.4414424
33 0.0363252436 0.000000e+00 10.332739 1.738534 1.875788 0.13636364 0.4281135
34 0.0931447643 0.000000e+00 9.256320 1.649674 1.799343 0.22580645 0.4109802
35 0.0482611829 0.000000e+00 10.782837 1.790332 2.049092 0.11363636 0.4429319
36 0.0680438269 0.000000e+00 9.444054 2.033923 2.190076 0.20588235 0.4088719
37 0.0083488854 0.000000e+00 8.721649 2.421450 2.817230 0.10638298 0.4528598
38 0.0115946974 0.000000e+00 10.198371 3.807359 4.685579 0.06818182 0.4433031
39 0.0871975926 0.000000e+00 14.501153 1.702952 1.941156 0.16129032 0.4003596
40 0.0499908919 0.000000e+00 6.793277 2.445833 2.593750 0.12820513 0.4746433
41 0.0438529631 0.000000e+00 8.111675 2.325271 2.521818 0.12820513 0.4307836
42 0.0671203069 0.000000e+00 9.193287 2.493307 3.043052 0.16129032 0.4140920
43 0.0591755023 3.774758e-15 7.590230 1.620416 1.832212 0.19047619 0.4582356
44 0.1148700794 0.000000e+00 9.117718 1.528436 1.664563 0.22857143 0.4009439
45 0.0761434590 0.000000e+00 9.795145 1.615673 1.719863 0.15909091 0.4266241
46 0.0687796553 0.000000e+00 10.820738 1.980544 2.124735 0.14705882 0.4107204
47 0.1005170754 0.000000e+00 9.777778 1.886886 2.005943 0.23076923 0.4327734
48 0.0582042115 4.662937e-15 14.237613 1.628461 1.676821 0.11363636 0.4500207
49 0.1298267133 0.000000e+00 9.168717 1.510723 1.568971 0.19354839 0.4171289
50 0.0798197277 0.000000e+00 9.959720 1.666256 1.896645 0.20454545 0.4348710
51 0.0009559613 0.000000e+00 10.029982 2.223748 2.611270 0.11363636 0.4459007
52 0.0709743973 0.000000e+00 10.051865 1.954863 2.082251 0.18421053 0.4191192
53 0.0632137794 2.664535e-15 15.009677 1.655582 1.716937 0.13953488 0.4894152
54 0.1497421787 0.000000e+00 10.622902 1.592817 1.610797 0.29032258 0.4048384
55 0.0324855426 0.000000e+00 12.385823 1.736111 1.739430 0.11111111 0.4787378
56 0.0146942484 4.440892e-16 12.258658 1.805516 1.852024 0.04545455 0.4557667
57 0.0654299909 0.000000e+00 9.065851 2.360674 2.774967 0.14285714 0.4038439
58 0.0271677050 0.000000e+00 13.732812 2.252614 2.370302 0.06976744 0.4751030
59 0.1055818837 0.000000e+00 10.472826 1.790872 1.884655 0.15384615 0.4206198
60 0.1556148835 0.000000e+00 8.303887 1.552256 1.580529 0.25806452 0.4166558
61 0.0905412950 0.000000e+00 9.357224 1.738180 1.738180 0.13513514 0.4259896
62 0.0582925685 0.000000e+00 9.930340 1.630952 1.719841 0.20000000 0.4201360
63 0.0766126838 0.000000e+00 12.020606 1.806696 1.897692 0.12820513 0.4221634
64 0.0464411720 0.000000e+00 14.768577 1.663553 1.781812 0.15555556 0.4779881
65 0.0909414903 0.000000e+00 11.528840 1.418433 1.437282 0.17142857 0.3956364
66 -0.0060557912 0.000000e+00 8.058658 2.168177 2.493830 0.11363636 0.4764581
67 0.0625068038 0.000000e+00 10.351190 1.936479 2.168155 0.11363636 0.4202327
68 0.0313318207 0.000000e+00 5.941653 2.818303 2.986775 0.10869565 0.4603765
69 0.0172526717 0.000000e+00 13.200617 1.520602 1.600116 0.11111111 0.4964637
70 0.0721628843 8.215650e-15 10.156898 1.548354 1.694417 0.12820513 0.4401819
71 0.1250389180 0.000000e+00 9.046948 2.064878 2.171170 0.30000000 0.4143914
72 0.0973242931 0.000000e+00 8.355556 1.992381 2.117405 0.14285714 0.4353311
73 0.0571762540 0.000000e+00 12.777962 1.574130 1.613624 0.08333333 0.4557690
74 0.1071860535 0.000000e+00 11.194598 1.876768 2.038889 0.17948718 0.4093205
75 0.0993286665 0.000000e+00 11.913491 1.515636 1.624253 0.15151515 0.3979925
76 -0.0463636253 0.000000e+00 14.100000 2.094810 2.094810 0.11320755 0.4932336
77 0.1295115413 0.000000e+00 9.344048 1.424020 1.511013 0.22857143 0.4083611
78 0.0764030732 0.000000e+00 9.372989 2.054399 2.168087 0.15625000 0.4320451
79 0.0609736509 0.000000e+00 9.023372 1.881684 1.972475 0.20588235 0.4514396
80 0.1012184250 0.000000e+00 7.886441 1.759125 1.801227 0.25714286 0.4431595
81 0.0614671056 0.000000e+00 11.275663 1.908358 2.057292 0.15909091 0.4285900
82 0.0845856821 0.000000e+00 10.097546 2.009993 2.134381 0.14705882 0.4091480
83 0.1035621430 0.000000e+00 8.355556 2.178945 2.353645 0.14705882 0.3994148
84 0.0703882555 0.000000e+00 10.861111 1.794416 1.912441 0.17142857 0.4064970
85 0.0587514067 0.000000e+00 17.556114 1.802925 1.833109 0.16279070 0.4405691
86 0.0670034171 0.000000e+00 10.963933 1.795935 1.795935 0.13513514 0.4331160
87 0.0985907593 0.000000e+00 7.558480 1.885787 1.899676 0.16129032 0.4196452
88 0.0035867674 0.000000e+00 10.670711 3.732429 4.501279 0.06818182 0.4412508
89 0.0653294062 0.000000e+00 10.397983 2.129099 2.340720 0.12820513 0.4317265
90 0.1211905231 0.000000e+00 9.394474 1.601427 1.630948 0.17142857 0.4152712
91 0.0919316023 0.000000e+00 9.760908 1.779546 1.959368 0.20454545 0.4397712
92 0.0495915916 0.000000e+00 7.967008 1.853720 1.995094 0.17948718 0.4490838
93 0.0780603940 0.000000e+00 9.545611 2.065371 2.381795 0.11363636 0.4293064
94 0.0676466192 0.000000e+00 8.572327 3.010248 3.010248 0.14285714 0.4297161
95 0.0524458358 0.000000e+00 12.353552 1.770990 2.088241 0.10869565 0.4206662
96 0.0652147235 0.000000e+00 6.832482 2.403949 2.575542 0.15151515 0.4444794
97 0.0993286665 0.000000e+00 11.913491 1.515636 1.624253 0.15151515 0.3979925
98 0.0531493699 0.000000e+00 7.367716 2.353304 2.557681 0.13513514 0.4094701
99 0.0940022225 8.215650e-15 9.728944 1.543721 1.690691 0.13636364 0.4330375
100 0.0735107443 0.000000e+00 9.481034 1.513273 1.605940 0.14705882 0.4169019
for (i in 2:10){df_clust_opt <- tsclust(df_lst, type = "partitional", k = i, distance = "dtw", centroid = "pam",seed = 700)
plot(df_clust_opt)}
# for (i in 1:10){df_clust_opt_seed <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = i)
# plot(df_clust_opt_seed)}
# for (i in 11:20){df_clust_opt_seed <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = i)
# plot(df_clust_opt_seed)}
# for (i in 21:30){df_clust_opt_seed <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = i)
# plot(df_clust_opt_seed)}
#k4
df_clust_opt_final <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = 700)
plot(df_clust_opt_final)
# Extract cluster assignments
cluster_assignments <- df_clust_opt_final@cluster
# Determine the number of clusters
num_clusters <- max(cluster_assignments)
# Loop through each cluster and print the jurisdictions in it
for (cluster_number in 1:num_clusters) {
cat("Jurisdictions in Cluster", cluster_number, ":\n")
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Print the jurisdictions corresponding to these indices
print(jurisdiction[indices_in_cluster])
cat("\n") # Add a newline for readability
}
Jurisdictions in Cluster 1 :
[1] "Hawaii" "North Carolina" "South Carolina" "Tennessee"
Jurisdictions in Cluster 2 :
[1] "Arizona" "Connecticut" "Maryland" "Michigan" "Missouri" "Montana" "Nevada" "North Dakota" "Pennsylvania"
[10] "Vermont" "Wisconsin"
Jurisdictions in Cluster 3 :
[1] "Alabama" "Alaska" "California" "Florida" "Illinois" "Indiana" "Kentucky" "Louisiana" "Maine"
[10] "Massachusetts" "Nebraska" "New York" "Ohio" "Oklahoma" "Oregon" "South Dakota" "Texas" "Wyoming"
Jurisdictions in Cluster 4 :
[1] "Arkansas" "Colorado" "Delaware" "Georgia" "Idaho" "Iowa" "Kansas" "Minnesota" "Mississippi"
[10] "National" "New Hampshire" "New Jersey" "New Mexico" "Rhode Island" "Utah" "Virginia" "Washington" "West Virginia"
# Create an empty dataframe to store the results
jurisdiction_clusters <- data.frame(Jurisdiction = character(), G8_Cluster = numeric(), stringsAsFactors = FALSE)
# Loop through each cluster and append the jurisdictions and their cluster number to the dataframe
for (cluster_number in 1:num_clusters) {
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Extract the jurisdictions corresponding to these indices
jurisdictions_in_cluster <- jurisdiction[indices_in_cluster]
# Create a temporary dataframe for this cluster
temp_df <- data.frame(Jurisdiction = jurisdictions_in_cluster, G8_Cluster = rep(cluster_number, length(jurisdictions_in_cluster)), stringsAsFactors = FALSE)
# Append the temporary dataframe to the main dataframe
jurisdiction_clusters <- rbind(jurisdiction_clusters, temp_df)
}
# Export the dataframe to a CSV file
write.csv(jurisdiction_clusters, "../clean_data/jurisdiction_clusters_G8.csv", row.names = FALSE)
# View the resulting dataframe
print(jurisdiction_clusters)
# Load necessary libraries
library(ggplot2)
library(reshape2)
# Loop through each cluster
for (cluster_number in 1:num_clusters) {
cat("Plotting jurisdictions in Cluster", cluster_number, ":\n")
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Get the names of the jurisdictions in this cluster
jurisdictions_in_cluster <- jurisdiction[indices_in_cluster]
# Filter the gap_list_year data frame for these jurisdictions
cluster_data <- df[df$Jurisdiction %in% jurisdictions_in_cluster, ]
# Convert the data to long format for ggplot
long_df <- melt(cluster_data, id.vars = "Jurisdiction", variable.name = "Year", value.name = "Value")
# Plot
p <- ggplot(long_df, aes(x = Year, y = Value, group = Jurisdiction, color = Jurisdiction)) +
geom_line() +
labs(title = paste("Cluster", cluster_number), x = "Year", y = "Gap") +
theme(legend.position = "right")
print(p)
#ggsave(paste("cluster_", cluster_number, ".png", sep=""), plot = p)
}
Plotting jurisdictions in Cluster 1 :
Plotting jurisdictions in Cluster 2 :
Plotting jurisdictions in Cluster 3 :
Plotting jurisdictions in Cluster 4 :